1 Data

Information about the test:

question description
z1 linear function
z2 3D
z3 arithmetic rules
z4 Easy ineq.
z5 logs
z6 logs
z7 graph translation
z8 sine pi/3
z9 trig.ineq.
z10 trig.identity
z11 period
z12 rational exponents
z13 ellipsoid
z14 limit
z15 series
z16 diff.quotient
z17 graph f'
z18 product rule
z19 quotient rule
z20 (exp)'
z21 (ln(sin))'
z22 hyp.functions
z23 slope tangent
z24 IVT
z25 velocity
z26 int(poly)
z27 int(exp)
z28 Int = 0
z29 int even funct
z31 int(abs)
z32 FtoC algebra
z33 difference vectors
z35 intersect planes
z36 parallel planes
z30 FtoC graph
z34 normal vector

Load the student scores for the test - here we load the 2017 and 2018 ETH Zurich test data:

test_scores
## # A tibble: 3,433 × 38
##    year  class    z1    z2    z3    z4    z5    z6    z7    z8    z9   z10   z11
##    <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2017  s21t…     1     0     1     1     1     0     1     0     1     0     1
##  2 2017  s21t…     1     0     1     1     1     1     0     1     1     2     1
##  3 2017  s21t…     1     0     0     0     1     1     1     0     1     1     1
##  4 2017  s21t…     1     0     1     1     1     1     1     1     1     1     1
##  5 2017  s21t…     1     0     1     0     2     0     1     0     1     0     2
##  6 2017  s21t…     0     1     0     0     1     2     0     2     2     2     1
##  7 2017  s21t…     1     0     1     0     2     1     0     1     2     1     0
##  8 2017  s21t…     1     1     1     1     2     1     1     2     2     2     2
##  9 2017  s21t…     1     1     0     1     1     1     1     1     1     1     1
## 10 2017  s21t…     1     0     1     0     0     1     0     0     1     1     1
## # … with 3,423 more rows, and 25 more variables: z12 <dbl>, z13 <dbl>,
## #   z14 <dbl>, z15 <dbl>, z16 <dbl>, z17 <dbl>, z18 <dbl>, z19 <dbl>,
## #   z20 <dbl>, z21 <dbl>, z22 <dbl>, z23 <dbl>, z24 <dbl>, z25 <dbl>,
## #   z26 <dbl>, z27 <dbl>, z28 <dbl>, z29 <dbl>, z30 <dbl>, z31 <dbl>,
## #   z32 <dbl>, z33 <dbl>, z34 <dbl>, z35 <dbl>, z36 <dbl>

1.1 Missing data

The data includes scores of 2 for the “I don’t know” answer option. We replace these with 0, reflecting a non-correct answer:

test_scores <- test_scores %>% 
  mutate(across(starts_with("z"), ~ ifelse(. == 2, 0, .)))

1.2 Data summary

The number of responses from each class:

test_scores <- test_scores %>% 
  rowwise() %>% 
  mutate(Total = rowSums(across(starts_with("z"))))

test_scores_summary <- test_scores %>% 
  group_by(year) %>% 
  summarise(
    n = n(),
    mean = mean(Total),
    sd = sd(Total),
    median = median(Total)
  )

test_scores_summary %>% 
  gt() %>% 
  fmt_number(columns = c("mean", "sd"), decimals = 2) %>%
  data_color(
    columns = c("n"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
year n mean sd median
2017 1682 18.68 8.44 19
2018 1751 19.03 8.20 19
p1 <- test_scores %>% 
  ggplot(aes(x = Total)) +
  geom_histogram(binwidth = 2) +
  #scale_x_continuous(limits = c(0,100), breaks = c(0, 50, 100)) +
  facet_grid(cols = vars(year)) +
  theme_minimal() +
  labs(x = "Total score (out of 36)",
       y = "Number of students",
       title = "ETH s21t") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

p2 <- test_scores_summary %>% 
  mutate(
    n = str_glue("{n}"),
    mean = str_glue("{round(mean, digits = 1)}"),
    sd = str_glue("{round(sd, digits = 1)}"),
    median = str_glue("{median}")
  ) %>% 
  pivot_longer(c(n, mean, sd, median), names_to = "layer", values_to = "label") %>% 
  mutate(layer = fct_relevel(layer, c("n", "sd", "mean", "median")) %>% fct_rev()) %>% 
  ggplot(aes(x = 80, y = layer, label = label)) +
  geom_text(size = 10 * 5/14, hjust = 1) +
  scale_x_continuous(limits = c(0,100)) +
  facet_grid(cols = vars(year)) +
  labs(y = "", x = NULL) +
  scale_y_discrete(labels = c("n" = "N", "mean" = "Mean", "median" = "Median")) +
  theme_minimal() +
  theme(axis.line = element_blank(), axis.ticks = element_blank(), axis.text.x = element_blank(),
        panel.grid = element_blank(), strip.text = element_blank())

p1 / p2 +  plot_layout(heights = c(5, 2.5))

ggsave("output/eth_pre_data-summary.pdf", units = "cm", width = 8, height = 8)

Mean and standard deviation for each item:

test_scores %>% 
  select(-class, -Total) %>% 
  group_by(year) %>% 
  skim_without_charts() %>% 
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
2017 2018
complete n_missing Mean SD complete n_missing Mean SD
z1 100% 0 0.95 0.22 100% 0 0.95 0.21
z2 100% 0 0.31 0.46 100% 0 0.28 0.45
z3 100% 0 0.62 0.48 100% 0 0.64 0.48
z4 100% 0 0.64 0.48 100% 0 0.62 0.48
z5 100% 0 0.47 0.50 100% 0 0.49 0.50
z6 100% 0 0.71 0.46 100% 0 0.73 0.44
z7 100% 0 0.71 0.46 100% 0 0.70 0.46
z8 100% 0 0.47 0.50 100% 0 0.50 0.50
z9 100% 0 0.46 0.50 100% 0 0.46 0.50
z10 100% 0 0.54 0.50 100% 0 0.54 0.50
z11 100% 0 0.57 0.49 100% 0 0.58 0.49
z12 100% 0 0.68 0.47 100% 0 0.71 0.45
z13 100% 0 0.37 0.48 100% 0 0.36 0.48
z14 100% 0 0.56 0.50 100% 0 0.57 0.49
z15 100% 0 0.46 0.50 100% 0 0.46 0.50
z16 100% 0 0.19 0.39 100% 0 0.20 0.40
z17 100% 0 0.59 0.49 100% 0 0.60 0.49
z18 100% 0 0.38 0.49 100% 0 0.36 0.48
z19 100% 0 0.35 0.48 100% 0 0.34 0.47
z20 100% 0 0.72 0.45 100% 0 0.72 0.45
z21 100% 0 0.51 0.50 100% 0 0.52 0.50
z22 100% 0 0.52 0.50 100% 0 0.52 0.50
z23 100% 0 0.41 0.49 100% 0 0.43 0.50
z24 100% 0 0.59 0.49 100% 0 0.58 0.49
z25 100% 0 0.54 0.50 100% 0 0.73 0.44
z26 100% 0 0.80 0.40 100% 0 0.80 0.40
z27 100% 0 0.39 0.49 100% 0 0.40 0.49
z28 100% 0 0.42 0.49 100% 0 0.46 0.50
z29 100% 0 0.48 0.50 100% 0 0.52 0.50
z30 100% 0 0.76 0.43 100% 0 0.73 0.44
z31 100% 0 0.38 0.48 100% 0 0.37 0.48
z32 100% 0 0.23 0.42 100% 0 0.20 0.40
z33 100% 0 0.78 0.42 100% 0 0.76 0.43
z34 100% 0 0.58 0.49 100% 0 0.61 0.49
z35 100% 0 0.32 0.47 100% 0 0.33 0.47
z36 100% 0 0.22 0.41 100% 0 0.23 0.42

2 Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

2.1 Inter-item correlations

If the test is unidimensional then we would expect student scores on pairs of items to be correlated.

This plot shows the correlations between scores on each pair of items:

item_scores <- test_scores %>% 
  select(-class, -year, -Total)

cor_ci <- psych::corCi(item_scores, plot = FALSE)

psych::cor.plot.upperLowerCi(cor_ci)

Almost all correlations are significantly different from 0 - the only one that is not is:

cor_ci$ci %>% 
  as_tibble(rownames = "corr") %>% 
  filter(p > 0.05) %>% 
  arrange(-p) %>% 
  select(-contains(".e")) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
z1-z2 −0.002 0.053 0.073

Thus, the overall picture is that the item scores are well correlated with each other.

2.2 Dimensionality

We use factor analysis as another way to assess unidimensionality. We are looking to see whether a 1-factor solution is feasible.

structure <- check_factorstructure(item_scores)
n <- n_factors(item_scores)

2.2.1 Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.95).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(630) = 30319.01, p < .001).

2.2.1.1 Method Agreement Procedure:

The choice of 6 dimensions is supported by 4 (21.05%) methods out of 19 (Optimal coordinates, Parallel analysis, Kaiser criterion, BIC).

plot(n)

summary(n) %>% gt()
n_Factors n_Methods
1 3
2 1
3 1
4 1
6 4
8 1
11 3
33 1
34 1
35 3
#n %>% tibble() %>% gt()
fa.parallel(item_scores, fa = "fa")

## Parallel analysis suggests that the number of factors =  9  and the number of components =  NA
pdf(file = "output/eth_pre_scree.pdf", width = 6, height = 4)
fa.parallel(item_scores, fa = "fa")
## Parallel analysis suggests that the number of factors =  9  and the number of components =  NA
dev.off()
## png 
##   2

This shows that both the 6-factor and 1-factor solutions are worth exploring.

2.2.2 1 Factor

We use the factanal function to fit a 1-factor model.

Note that this function cannot handle missing data, so any NA scores must be set to 0 for this analysis.

Show factanal output
fitfact <- factanal(item_scores,
                    factors = 1,
                    rotation = "varimax")
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 1, rotation = "varimax")
## 
## Uniquenesses:
##   z1   z2   z3   z4   z5   z6   z7   z8   z9  z10  z11  z12  z13  z14  z15  z16 
## 0.95 0.95 0.75 0.79 0.72 0.74 0.79 0.79 0.61 0.65 0.67 0.88 0.91 0.73 0.87 0.81 
##  z17  z18  z19  z20  z21  z22  z23  z24  z25  z26  z27  z28  z29  z30  z31  z32 
## 0.78 0.78 0.78 0.63 0.59 0.61 0.64 0.73 0.94 0.69 0.63 0.64 0.85 0.81 0.78 0.78 
##  z33  z34  z35  z36 
## 0.88 0.86 0.79 0.84 
## 
## Loadings:
##  [1] 0.53 0.51 0.62 0.59 0.57 0.52 0.61 0.64 0.62 0.60 0.52 0.56 0.61 0.60     
## [16]      0.50 0.45 0.46 0.46 0.35 0.31 0.36 0.44 0.47 0.46 0.47      0.39 0.44
## [31] 0.46 0.47 0.34 0.38 0.46 0.40
## 
##                Factor1
## SS loadings       8.35
## Proportion Var    0.23
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 5447.17 on 594 degrees of freedom.
## The p-value is 0
load <- tidy(fitfact)

ggplot(load, aes(x = fl1, y = 0)) + 
  geom_point() + 
  geom_label_repel(aes(label = paste0("z", rownames(load))), show.legend = FALSE) +
  labs(x = "Factor 1", y = NULL,
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

load %>% 
  select(question = variable, factor_loading = fl1) %>% 
  left_join(item_info, by = "question") %>% 
  arrange(-factor_loading) %>% 
  gt() %>%
  data_color(
    columns = contains("factor"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
question factor_loading description
z21 0.6407106 (ln(sin))'
z9 0.6228304 trig.ineq.
z22 0.6223752 hyp.functions
z27 0.6076260 int(exp)
z20 0.6061690 (exp)'
z23 0.5998775 slope tangent
z28 0.5961137 Int = 0
z10 0.5878608 trig.identity
z11 0.5709488 period
z26 0.5590386 int(poly)
z5 0.5324571 logs
z14 0.5232019 limit
z24 0.5215894 IVT
z6 0.5086087 logs
z3 0.4972984 arithmetic rules
z19 0.4718772 quotient rule
z17 0.4705175 graph f'
z32 0.4683131 FtoC algebra
z18 0.4646006 product rule
z31 0.4643091 int(abs)
z8 0.4602708 sine pi/3
z7 0.4593494 graph translation
z35 0.4579597 intersect planes
z4 0.4547066 Easy ineq.
z30 0.4376910 FtoC graph
z16 0.4367742 diff.quotient
z36 0.4014553 parallel planes
z29 0.3875447 int even funct
z34 0.3765420 normal vector
z15 0.3603866 series
z12 0.3513586 rational exponents
z33 0.3415341 difference vectors
z13 0.3075858 ellipsoid
z25 0.2364578 velocity
z2 0.2342048 3D
z1 0.2277957 linear function

This factor appears to be dominated by “procedural calculus skills”: standard calculation of derivatives, integral and some trigonometry.

2.2.3 6 Factor

Here we also investigate the 6-factor solution, to see whether these factors are interpretable.

Show factanal output
fitfact6 <- factanal(item_scores, factors = 6, rotation = "varimax")
print(fitfact6, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores, factors = 6, rotation = "varimax")
## 
## Uniquenesses:
##   z1   z2   z3   z4   z5   z6   z7   z8   z9  z10  z11  z12  z13  z14  z15  z16 
## 0.91 0.81 0.74 0.75 0.68 0.74 0.72 0.76 0.53 0.64 0.64 0.87 0.80 0.71 0.82 0.72 
##  z17  z18  z19  z20  z21  z22  z23  z24  z25  z26  z27  z28  z29  z30  z31  z32 
## 0.69 0.41 0.00 0.47 0.53 0.59 0.60 0.63 0.91 0.58 0.60 0.61 0.83 0.71 0.76 0.75 
##  z33  z34  z35  z36 
## 0.84 0.70 0.56 0.70 
## 
## Loadings:
##     Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## z9   0.54    0.37                                  
## z23  0.50                                          
## z28  0.50                                          
## z24          0.52                                  
## z20                  0.59                          
## z18                          0.69                  
## z19                          0.95                  
## z35                                  0.58          
## z1                                                 
## z2                                           0.40  
## z3   0.31                                          
## z4           0.33                                  
## z5   0.38                                          
## z6                                                 
## z7           0.44                                  
## z8   0.39                                          
## z10  0.45                                          
## z11  0.43    0.36                                  
## z12                                                
## z13                                          0.34  
## z14  0.32                                          
## z15                                                
## z16  0.43                                          
## z17          0.48                                  
## z21  0.46            0.43                          
## z22  0.41            0.36                          
## z25                                                
## z26          0.34    0.48                          
## z27  0.45            0.32                          
## z29                                                
## z30          0.48                                  
## z31  0.33                                          
## z32  0.41                                          
## z33          0.33                                  
## z34                                  0.47          
## z36                                  0.43          
## 
##                Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings        3.5    2.65    1.68    1.62    1.26    0.98
## Proportion Var     0.1    0.07    0.05    0.05    0.04    0.03
## Cumulative Var     0.1    0.17    0.22    0.26    0.30    0.32
## 
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 932.36 on 429 degrees of freedom.
## The p-value is 2.36e-39
load6 <- tidy(fitfact6)

ggplot(load6, aes(x = fl1, y = fl2)) + 
  geom_point() + 
  geom_label_repel(aes(label = paste0("z", rownames(load))), show.legend = FALSE) +
  labs(x = "Factor 1", y = "Factor 2",
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()

main_factors <- load6 %>% 
#  mutate(factorNone = 0.4) %>%  # add this to set the main factor to "None" where all loadings are below 0.4
  pivot_longer(names_to = "factor",
               cols = contains("fl")) %>% 
  mutate(value_abs = abs(value)) %>% 
  group_by(variable) %>% 
  top_n(1, value_abs) %>% 
  ungroup() %>% 
  transmute(main_factor = factor, variable)

load6 %>% 
  select(-uniqueness) %>% 
  # add the info about which is the main factor
  left_join(main_factors, by = "variable") %>%
  left_join(item_info %>% select(variable = question, description), by = "variable") %>% 
  arrange(main_factor) %>% 
  select(main_factor, everything()) %>% 
  # arrange adjectives by descending loading on main factor
  rowwise() %>% 
  mutate(max_loading = max(abs(c_across(starts_with("fl"))))) %>% 
  group_by(main_factor) %>% 
  arrange(-max_loading, .by_group = TRUE) %>% 
  select(-max_loading) %>% 
  # sort out the presentation
  rename("Main Factor" = main_factor, # the _ throws a latex error
         "Question" = variable) %>%
  mutate_at(
    vars(starts_with("fl")),
    ~ cell_spec(round(., digits = 3), bold = if_else(abs(.) > 0.4, T, F))
  ) %>% 
  kable(booktabs = T, escape = F, longtable = T) %>% 
  kableExtra::collapse_rows(columns = 1, valign = "top") %>%
  kableExtra::kable_styling(latex_options = c("repeat_header"))
Main Factor Question fl1 fl2 fl3 fl4 fl5 fl6 description
fl1 z9 0.543 0.371 0.045 0.1 0.13 0.086 trig.ineq.
fl1 z23 0.504 0.289 0.189 0.085 0.136 0.023 slope tangent
fl1 z28 0.503 0.292 0.145 0.094 0.147 0.048 Int = 0
fl1 z21 0.456 0.194 0.429 0.13 0.143 0.07 (ln(sin))’
fl1 z27 0.453 0.193 0.316 0.086 0.216 0.075 int(exp)
fl1 z10 0.447 0.239 0.234 0.104 0.084 0.182 trig.identity
fl1 z16 0.427 -0.017 0.141 0.129 0.128 0.217 diff.quotient
fl1 z11 0.426 0.357 0.127 0.065 0.082 0.15 period
fl1 z22 0.408 0.237 0.36 0.108 0.21 0.073 hyp.functions
fl1 z32 0.407 0.122 0.106 0.104 0.19 0.115 FtoC algebra
fl1 z8 0.386 0.145 0.138 0.12 0.058 0.186 sine pi/3
fl1 z5 0.379 0.182 0.2 0.093 0.114 0.278 logs
fl1 z31 0.325 0.262 0.059 0.07 0.178 0.151 int(abs)
fl1 z14 0.324 0.176 0.266 0.118 0.18 0.18 limit
fl1 z3 0.313 0.212 0.247 0.085 0.08 0.213 arithmetic rules
fl1 z6 0.293 0.268 0.257 0.072 0.122 0.136 logs
fl2 z24 0.272 0.516 0.074 0.079 0.134 0.049 IVT
fl2 z30 0.171 0.484 0.086 0.051 0.094 0.08 FtoC graph
fl2 z17 0.193 0.478 0.118 0.073 0.061 0.137 graph f’
fl2 z7 0.167 0.443 0.157 0.036 0.095 0.161 graph translation
fl2 z4 0.225 0.334 0.118 0.064 0.084 0.249 Easy ineq.
fl2 z33 0.101 0.328 0.089 0.036 0.113 0.158 difference vectors
fl2 z1 0.046 0.27 0.089 0.027 0.029 0.06 linear function
fl2 z25 0.056 0.264 0.099 0.009 0.055 0.054 velocity
fl2 z12 0.185 0.191 0.143 0.083 0.072 0.168 rational exponents
fl3 z20 0.286 0.268 0.593 0.09 0.115 0.059 (exp)’
fl3 z26 0.221 0.338 0.476 0.11 0.142 0.002 int(poly)
fl3 z29 0.23 0.126 0.27 0.093 0.145 0.03 int even funct
fl4 z19 0.227 0.071 0.129 0.951 0.101 0.085 quotient rule
fl4 z18 0.211 0.138 0.135 0.692 0.114 0.106 product rule
fl5 z35 0.216 0.133 0.114 0.104 0.583 0.102 intersect planes
fl5 z34 0.109 0.185 0.169 0.057 0.47 0.006 normal vector
fl5 z36 0.223 0.106 0.06 0.045 0.43 0.212 parallel planes
fl6 z2 0.093 0.117 0.011 0.046 0.024 0.403 3D
fl6 z13 0.099 0.255 -0.006 0.044 0.106 0.339 ellipsoid
fl6 z15 0.154 0.233 0.084 0.023 0.143 0.28 series

Here, the first factor seems better described by “abstract understanding of calculus”. The second is based on “graphical understanding of calculus”.

The third and fourth factors are “procedural calculus skills”, with the product and quotient rules distinguished in the fourth factor.

The fifth and sixth factors have a geometrical flavour, with the fifth factor in particular being about “vector geometry”.

2.2.4 Conclusion

The 6-factor solution does seem to be interpretable, however the 1-factor solution is still reasonable. In terms of explaining the variance in the data, the 1-factor solution does a good job at capturing a large chunk:

# Compute the proportion of variance explained; see https://stackoverflow.com/a/58159992/17459126
x1 <- loadings(fitfact)
propvar1 <- colSums(x1^2)/nrow(x1)

rbind(`Proportion Var` = propvar1, `Cumulative Var` = cumsum(propvar1))
##                  Factor1
## Proportion Var 0.2320524
## Cumulative Var 0.2320524
x6 <- loadings(fitfact6)
propvar6 <- colSums(x6^2)/nrow(x6)

rbind(`Proportion Var` = propvar6, `Cumulative Var` = cumsum(propvar6))
##                   Factor1    Factor2    Factor3    Factor4    Factor5
## Proportion Var 0.09717865 0.07370191 0.04679681 0.04500204 0.03500279
## Cumulative Var 0.09717865 0.17088056 0.21767737 0.26267941 0.29768220
##                   Factor6
## Proportion Var 0.02729802
## Cumulative Var 0.32498023

Therefore, we are justified in proceeding with a unidimensional IRT model.

3 Fitting 2 parameter logistic MIRT model

We can fit a Multidimensional Item Response Theory (mirt) model. From the function definition:

mirt fits a maximum likelihood (or maximum a posteriori) factor analysis model to any mixture of dichotomous and polytomous data under the item response theory paradigm using either Cai's (2010) Metropolis-Hastings Robbins-Monro (MHRM) algorithm.

The process is to first fit the model, and save the result as a model object that we can then parse to get tabular or visual displays of the model that we might want. When fitting the model, we have the option to specify a few arguments, which then get interpreted as parameters to be passed to the model.

Show model fitting output
fit_2pl <- mirt(
  data = item_scores, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "2PL",   # 2 parameter logistic model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -66668.278, Max-Change: 0.71582
Iteration: 2, Log-Lik: -65483.178, Max-Change: 0.26188
Iteration: 3, Log-Lik: -65316.598, Max-Change: 0.14767
Iteration: 4, Log-Lik: -65249.858, Max-Change: 0.11072
Iteration: 5, Log-Lik: -65212.345, Max-Change: 0.09658
Iteration: 6, Log-Lik: -65190.906, Max-Change: 0.06270
Iteration: 7, Log-Lik: -65178.102, Max-Change: 0.06095
Iteration: 8, Log-Lik: -65169.976, Max-Change: 0.03668
Iteration: 9, Log-Lik: -65165.338, Max-Change: 0.03883
Iteration: 10, Log-Lik: -65162.425, Max-Change: 0.02443
Iteration: 11, Log-Lik: -65160.764, Max-Change: 0.01661
Iteration: 12, Log-Lik: -65159.750, Max-Change: 0.01247
Iteration: 13, Log-Lik: -65158.047, Max-Change: 0.00237
Iteration: 14, Log-Lik: -65158.024, Max-Change: 0.00195
Iteration: 15, Log-Lik: -65158.012, Max-Change: 0.00168
Iteration: 16, Log-Lik: -65157.995, Max-Change: 0.00082
Iteration: 17, Log-Lik: -65157.990, Max-Change: 0.00089
Iteration: 18, Log-Lik: -65157.986, Max-Change: 0.00072
Iteration: 19, Log-Lik: -65157.976, Max-Change: 0.00018
Iteration: 20, Log-Lik: -65157.975, Max-Change: 0.00020
Iteration: 21, Log-Lik: -65157.975, Max-Change: 0.00019
Iteration: 22, Log-Lik: -65157.974, Max-Change: 0.00012
Iteration: 23, Log-Lik: -65157.973, Max-Change: 0.00012
Iteration: 24, Log-Lik: -65157.973, Max-Change: 0.00011
Iteration: 25, Log-Lik: -65157.973, Max-Change: 0.00009
## 
## Calculating information matrix...

3.1 Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

residuals_2pl  %>% as.matrix() %>% 
  corrplot::corrplot(type = "upper")

This shows that most item pairs are independent, with only one pair showing cause for concern:

residuals_2pl %>%
  rownames_to_column(var = "q1") %>%
  as_tibble() %>% 
  pivot_longer(cols = starts_with("z"), names_to = "q2", values_to = "Q3_score") %>% 
  filter(abs(Q3_score) > 0.2) %>% 
  filter(parse_number(q1) < parse_number(q2)) %>%
  gt()
q1 q2 Q3_score
z18 z19 0.6740812
z34 z35 0.2109123

Items z18 and z19 are on the product and quotient rules.

Given that this violation of the local independence assumption is very mild, we proceed using this model.

3.2 Model parameters

We then compute factor score estimates and augment the existing data frame with these estimates, to keep everything in one place. To do the estimation, we use the fscores() function from the mirt package which takes in a computed model object and computes factor score estimates according to the method specified. We will use the EAP method for factor score estimation, which is the “expected a-posteriori” method, the default. We specify it explicitly below, but the results would have been the same if we omitted specifying the method argument since it’s the default method the function uses.

test_scores_with_2pl_ability <- test_scores %>%
  bind_cols(fscores(fit_2pl, method = "EAP"))

We can also calculate the model coefficient estimates using a generic function coef() which is used to extract model coefficients from objects returned by modeling functions. We will set the IRTpars argument to TRUE, which means slope intercept parameters will be converted into traditional IRT parameters.

coefs_2pl <- coef(fit_2pl, IRTpars = TRUE)

The resulting object coefs is a list, with one element for each question, and an additional GroupPars element that we won’t be using. The output is a bit long, so we’re only showing a few of the elements here:

coefs_2pl[1:3]
## $z1
##                a         b  g  u
## par     1.219104 -2.946041  0  1
## CI_2.5  1.024917 -3.298279 NA NA
## CI_97.5 1.413291 -2.593803 NA NA
## 
## $z2
##                 a        b  g  u
## par     0.5612745 1.670103  0  1
## CI_2.5  0.4752132 1.405353 NA NA
## CI_97.5 0.6473358 1.934852 NA NA
## 
## $z3
##                a          b  g  u
## par     1.354723 -0.5385656  0  1
## CI_2.5  1.236060 -0.6126183 NA NA
## CI_97.5 1.473386 -0.4645129 NA NA
# coefs_2pl[35:37]

Let’s take a closer look at the first element:

coefs_2pl[1]
## $z1
##                a         b  g  u
## par     1.219104 -2.946041  0  1
## CI_2.5  1.024917 -3.298279 NA NA
## CI_97.5 1.413291 -2.593803 NA NA

In this output:

  • a is discrimination
  • b is difficulty
  • endpoints of the 95% confidence intervals are also shown

To make this output more user friendly, we have produced a function tidy_mirt_coeffs that transforms it into a tidy table.

source("fn_tidy_mirt_coefs.R")

tidy_2pl <- tidy_mirt_coefs(coefs_2pl) %>% 
  # keep only the a and b parameters from the 2PL model
  filter(par %in% c("a", "b"))

Here is the result:

tidy_2pl %>%
  mutate(ci = str_glue("[{round(CI_2.5, 2)}, {round(CI_97.5, 2)}]")) %>% 
  select(-starts_with("CI_")) %>% 
  pivot_wider(names_from = "par", values_from = c(est, ci), names_glue = "{par}_{.value}") %>% 
  gt() %>% 
  fmt_number(columns = contains("_est"), decimals = 2) %>%
  data_color(
    columns = contains("a_est"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("b_est"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = contains("b_")) %>%
  cols_label(
    a_est = "Est.",
    b_est = "Est.",
    a_ci = "95% CI",
    b_ci = "95% CI"
  )
Question Discrimination Difficulty
Est. 95% CI Est. 95% CI
z1 1.22 [1.02, 1.41] −2.95 [-3.3, -2.59]
z2 0.56 [0.48, 0.65] 1.67 [1.41, 1.93]
z3 1.35 [1.24, 1.47] −0.54 [-0.61, -0.46]
z4 1.18 [1.07, 1.29] −0.59 [-0.67, -0.5]
z5 1.47 [1.34, 1.59] 0.08 [0.02, 0.14]
z6 1.56 [1.42, 1.7] −0.86 [-0.94, -0.78]
z7 1.29 [1.17, 1.41] −0.88 [-0.97, -0.8]
z8 1.16 [1.06, 1.27] 0.07 [-0.01, 0.14]
z9 2.02 [1.86, 2.18] 0.13 [0.07, 0.18]
z10 1.77 [1.63, 1.91] −0.14 [-0.2, -0.09]
z11 1.68 [1.55, 1.82] −0.28 [-0.34, -0.22]
z12 0.89 [0.79, 0.99] −1.09 [-1.22, -0.96]
z13 0.72 [0.63, 0.81] 0.85 [0.72, 0.99]
z14 1.44 [1.31, 1.56] −0.26 [-0.32, -0.19]
z15 0.85 [0.76, 0.94] 0.21 [0.12, 0.3]
z16 1.54 [1.39, 1.69] 1.30 [1.19, 1.4]
z17 1.23 [1.12, 1.34] −0.40 [-0.48, -0.33]
z18 1.24 [1.12, 1.35] 0.55 [0.47, 0.63]
z19 1.31 [1.19, 1.43] 0.64 [0.56, 0.72]
z20 2.33 [2.13, 2.53] −0.74 [-0.8, -0.68]
z21 2.14 [1.97, 2.31] −0.05 [-0.1, 0.01]
z22 2.00 [1.84, 2.16] −0.05 [-0.11, 0]
z23 1.88 [1.72, 2.03] 0.27 [0.21, 0.33]
z24 1.44 [1.32, 1.56] −0.31 [-0.38, -0.25]
z25 0.53 [0.45, 0.62] −1.13 [-1.33, -0.92]
z26 2.42 [2.2, 2.65] −1.04 [-1.1, -0.97]
z27 1.99 [1.83, 2.16] 0.35 [0.29, 0.41]
z28 1.85 [1.7, 2] 0.21 [0.15, 0.27]
z29 0.91 [0.82, 1.01] 0.00 [-0.09, 0.09]
z30 1.26 [1.14, 1.38] −1.08 [-1.19, -0.98]
z31 1.25 [1.14, 1.36] 0.54 [0.46, 0.62]
z32 1.60 [1.45, 1.75] 1.15 [1.06, 1.24]
z33 0.94 [0.83, 1.04] −1.51 [-1.67, -1.35]
z34 0.91 [0.81, 1] −0.51 [-0.61, -0.42]
z35 1.28 [1.16, 1.4] 0.73 [0.65, 0.82]
z36 1.25 [1.12, 1.37] 1.28 [1.16, 1.39]
tidy_2pl_wide <- tidy_2pl %>% 
  pivot_wider(names_from = "par", values_from = c(est, CI_2.5, CI_97.5), names_glue = "{par}_{.value}")

wright_plot <- tidy_2pl_wide %>% 
  mutate(qnum = parse_number(Question)) %>% 
  ggplot(aes(
    x = a_est,
    y = b_est
  )) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0, alpha = 0.5) +
  geom_errorbar(aes(xmin = a_CI_2.5, xmax = a_CI_97.5), width = 0, alpha = 0.5) +
  geom_text_repel(aes(label = Question), alpha = 0.5) +
  geom_point() +
  theme_minimal() +
  labs(x = "Discrimination",
       y = "Difficulty")

wright_plot

tidy_2pl_wide %>% 
  write_csv("output/eth_pre_2pl-results.csv")

3.3 Comparing years and classes

Do students from different programmes of study have different distributions of ability?

3.3.1 Differences between years

Compare the distribution of abilities in the year groups (though in this case there is only one).

ggplot(test_scores_with_2pl_ability, aes(F1, fill = as.factor(year), colour = as.factor(year))) +
  geom_density(alpha=0.5) + 
  scale_x_continuous(limits = c(-3.5,3.5)) +
  labs(title = "Density plot", 
       subtitle = "Ability grouped by year of taking the test", 
       x = "Ability", y = "Density",
       fill = "Year", colour = "Year") +
  theme_minimal()

3.3.2 Differences between classes

Compare the distribution of abilities in the various classes.

ggplot(test_scores_with_2pl_ability, aes(x = F1, y = class, colour = class, fill = class)) +
  geom_density_ridges(alpha = 0.5) + 
  scale_x_continuous(limits = c(-3.5,3.5)) +
  guides(fill = FALSE, colour = FALSE) +
  labs(title = "Density plot", 
       subtitle = "Ability grouped by class of taking the test", 
       x = "Ability", y = "Class") +
  theme_minimal()
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Picking joint bandwidth of 0.194

3.4 Information curves

theta <- seq(-6, 6, by=0.05)

info_matrix <- testinfo(fit_2pl, theta, individual = TRUE)
colnames(info_matrix) <- item_info %>% pull(question)
item_info_data <- info_matrix %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "item", values_to = "info_y") %>% 
  left_join(item_info %>% select(item = question), by = "item") %>% 
  mutate(item = fct_reorder(item, parse_number(item)))

3.4.1 Test information curve

item_info_data %>% 
  group_by(theta) %>% 
  summarise(info_y = sum(info_y)) %>% 
  ggplot(aes(x = theta, y = info_y)) +
  geom_line() +
  labs(x = "Ability", y = "Information", title = "ETH s21t") +
  theme_minimal()

ggsave("output/eth_pre_info.pdf", width = 10, height = 6, units = "cm")

This shows that the information given by the test is focused reasonably centrally around the mean ability level.

3.4.2 Item information curves

Breaking this down by question helps to highlight those questions that are most/least informative:

item_info_data %>% 
  ggplot(aes(x = theta, y = info_y, colour = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Information") +
  theme_minimal()

3.5 Total information

Using mirt’s areainfo function, we can find the total area under the information curves.

irt_info <- areainfo(fit_2pl, c(-4,4))
irt_info %>% gt()
LowerBound UpperBound Info TotalInfo Proportion nitems
-4 4 49.13213 50.42239 0.9744109 36

This shows that the total information in all items is 50.4223874.

3.5.1 Information by item

tidy_info <- item_info %>%
  mutate(item_num = row_number()) %>% 
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(fit_2pl,
               c(-4, 4),
               which.items = .x) %>% pull(TotalInfo)
  ))

tidy_info %>%
  select(-item_num) %>% 
  arrange(-TotalInfo) %>% 
  #group_by(outcome) %>% 
  gt() %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
question description Information
z26 int(poly) 2.4238253
z20 (exp)' 2.3258216
z21 (ln(sin))' 2.1365919
z9 trig.ineq. 2.0158405
z22 hyp.functions 1.9989907
z27 int(exp) 1.9938718
z23 slope tangent 1.8750694
z28 Int = 0 1.8501832
z10 trig.identity 1.7675098
z11 period 1.6839435
z33 difference vectors 1.6041930
z6 logs 1.5577381
z16 diff.quotient 1.5386847
z5 logs 1.4663312
z24 IVT 1.4390034
z14 limit 1.4350697
z3 arithmetic rules 1.3547187
z19 quotient rule 1.3093576
z7 graph translation 1.2867712
z30 FtoC graph 1.2809342
z31 int(abs) 1.2631005
z32 FtoC algebra 1.2503874
z34 normal vector 1.2462025
z18 product rule 1.2373800
z17 graph f' 1.2268893
z1 linear function 1.2188793
z4 Easy ineq. 1.1804333
z8 sine pi/3 1.1623303
z35 intersect planes 0.9353326
z29 int even funct 0.9128694
z36 parallel planes 0.9058856
z12 rational exponents 0.8887859
z15 series 0.8494321
z13 ellipsoid 0.7169794
z2 3D 0.5552891
z25 velocity 0.5277613

Restricting instead to the range \(-2\leq\theta\leq2\):

tidy_info <- item_info %>%
  mutate(item_num = row_number()) %>% 
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(fit_2pl,
               c(-2, 2),
               which.items = .x) %>% pull(Info)
  ))

tidy_info %>%
  select(-item_num) %>% 
  arrange(-TotalInfo) %>% 
  #group_by(outcome) %>% 
  gt() %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
question description Information
z26 int(poly) 2.2090139
z20 (exp)' 2.2045246
z21 (ln(sin))' 2.0775693
z9 trig.ineq. 1.9433205
z22 hyp.functions 1.9265566
z27 int(exp) 1.9039799
z23 slope tangent 1.7785873
z28 Int = 0 1.7545283
z10 trig.identity 1.6643308
z11 period 1.5605291
z5 logs 1.3171858
z6 logs 1.3144395
z24 IVT 1.2724263
z14 limit 1.2722619
z33 difference vectors 1.2662362
z3 arithmetic rules 1.1482188
z16 diff.quotient 1.1403300
z19 quotient rule 1.0806177
z30 FtoC graph 1.0322320
z32 FtoC algebra 1.0269435
z17 graph f' 1.0141086
z18 product rule 1.0104220
z7 graph translation 1.0086145
z8 sine pi/3 0.9547704
z4 Easy ineq. 0.9399399
z31 int(abs) 0.9357119
z34 normal vector 0.8652332
z29 int even funct 0.6597833
z36 parallel planes 0.6349100
z15 series 0.5847449
z12 rational exponents 0.5619252
z35 intersect planes 0.5401834
z13 ellipsoid 0.4171787
z1 linear function 0.2895158
z25 velocity 0.2433460
z2 3D 0.2430917

3.6 Response curves

These show the probability of a correct response at each ability level:

trace_data <- probtrace(fit_2pl, theta) %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "level", values_to = "y") %>% 
  separate(level, into = c("item", NA, "score"), sep = "\\.") %>% 
  mutate(item = fct_reorder(item, parse_number(item))) %>% 
  filter(score == 1)

trace_data %>% 
  ggplot(aes(x = theta, y = y, colour = score)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1, guide = "none") +
  facet_wrap(vars(item)) +
  labs(y = "Probability of correct response") +
  theme_minimal()

These can all be overlaid on a single plot:

plt <- trace_data %>% 
  ggplot(aes(x = theta, y = y, colour = item, text = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Probability of correct response") +
  theme_minimal()

ggplotly(plt, tooltip = "text")
ggsave(plot = plt, file = "output/eth_pre_iccs-superimposed.pdf", width = 20, height = 14, units = "cm")

Here we highlight some items to illustrate the role of the difficulty and discrimination parameters

highlight_items <- c("z1", "z2", "z26", "z27")
item_colours <- c(viridis::viridis_pal(option = "plasma", end = 0.8, direction = -1)(length(highlight_items)), "black")

wright_plot_shaded <- tidy_2pl_wide %>%
  mutate(item = Question) %>%
  mutate(
    item_colour = case_when(item %in% highlight_items ~ item,
                            TRUE ~ "ignore"),
    item_size = if_else(item %in% highlight_items, 4, 3)
  ) %>% 
  ggplot(aes(
    x = a_est,
    y = b_est,
    colour = item_colour
  )) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0, alpha = 0.5) +
  geom_errorbar(aes(xmin = a_CI_2.5, xmax = a_CI_97.5), width = 0, alpha = 0.5) +
  geom_text_repel(aes(label = Question, size = item_size), alpha = 0.5) +
  geom_point() +
  scale_colour_manual("item", values = item_colours, limits = highlight_items, guide = "none") +
  scale_size(guide = "none", range = c(2.5, 4.5)) +
  theme_minimal() +
  coord_flip() +
  labs(x = "Discrimination",
       y = "Difficulty")

trace_examples <- trace_data %>% 
  filter(item %in% highlight_items) %>% 
  ggplot(aes(x = theta, y = y, colour = item)) +
  geom_line() +
  # ggrepel::geom_label_repel(
  #   data = trace_data %>% filter(item %in% highlight_items, y > 0.5) %>% group_by(item) %>% slice_min(y, n = 1),
  #   aes(label = item),
  #   box.padding = 0.5,
  #   min.segment.length = Inf,
  #   show.legend = FALSE
  # ) +
  geom_label(
    data = trace_data %>% filter(item %in% c("z1", "z27"), y > 0.45) %>% group_by(item) %>% slice_min(y, n = 1),
    aes(label = item),
    show.legend = FALSE
  ) +
  geom_label(
    data = trace_data %>% filter(item %in% c("z2", "z26"), y > 0.55) %>% group_by(item) %>% slice_min(y, n = 1),
    aes(label = item),
    show.legend = FALSE
  ) +
  scale_colour_viridis_d("item", option = "plasma", end = 0.8, direction = -1, guide = "none") +
  labs(x = "Ability", y = "Probability of correct response") +
  theme_minimal()

combined_plot <- patchwork::wrap_plots(wright_plot_shaded, trace_examples)
ggsave("output/eth_pre_wrightmap_highlight.pdf", plot = combined_plot, units = "cm", width = 18, height = 9)
combined_plot

removed_items <- read_csv("data-eth/eth-metadata.csv") %>% filter(is.na(post)) %>% mutate(pre = str_replace(pre, "A", "z")) %>% select(pre) %>% deframe()
highlight_removed_items <- trace_data %>% 
  mutate(highlight_item = item %in% removed_items) %>% 
  mutate(line_width = ifelse(highlight_item, 1, 0.5))

highlight_removed_items %>% 
  ggplot(aes(x = theta, y = y, colour = item, text = item, alpha = highlight_item)) +
  geom_line(aes(size = highlight_item)) +
  #geom_point(data = highlight_removed_items %>% filter(highlight_item == TRUE, theta == 0)) +
  ggrepel::geom_label_repel(
    #data = highlight_removed_items %>% filter(highlight_item == TRUE) %>% group_by(item) %>% mutate(item_num = cur_group_id()) %>% ungroup() %>% mutate(threshold = seq(5, -5, len=max(item_num))[item_num]) %>% filter(theta >= threshold) %>% group_by(item_num) %>% slice(n=1),
    #data = highlight_removed_items %>% filter(highlight_item == TRUE, y >= 0.5) %>% group_by(item) %>% slice_min(y, n = 1),
    data = highlight_removed_items %>% filter(highlight_item == TRUE, theta == 0),
    aes(label = item),
    box.padding = 0.5,
    max.overlaps = Inf,
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  scale_colour_viridis_d("Question", option = "plasma", end = 0.8, direction = -1) +
  scale_size_manual(values = c("FALSE" = 0.6, "TRUE" = 0.9), guide = "none") +
  scale_alpha_discrete(guide = "none", range = c(0.2, 1)) +
  labs(x = "Ability", y = "Expected score") +
  theme_minimal() +
  theme(legend.position="bottom",#legend.title=element_blank(),
      legend.margin = margin(0, 0, 0, 0),
      legend.spacing.x = unit(1, "mm"),
      legend.spacing.y = unit(0, "mm")) +
  guides(colour = guide_legend(nrow = 4))

ggsave(file = "output/eth_pre_iccs-highlight.pdf", width = 16, height = 12, units = "cm")

4 Checking 3PL

The 3 parameter IRT model adds a third “guessing” parameter for each item, which gives the lower asymptote of the response curve.

Given that the test offered an “I don’t know” option (which we have mapped to a score of 0), we would expect that students are less likely to be guessing when answering these multiple-choice questions. So, we would expect to see the lower aymptotes being closer to 0 than would ordinarily be expected for a 4- or 5-option MCQ.

fit_3pl <- mirt(
  data = item_scores, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "3PL",   # 3 parameter logistic model
  SE = TRUE           # estimate standard errors
  )

residuals_3pl <- residuals(fit_3pl, type = "Q3") %>% data.frame

coefs_3pl <- coef(fit_3pl, IRTpars = TRUE)

test_scores_with_3pl_ability <- test_scores %>%
  bind_cols(fscores(fit_3pl, method = "EAP"))

4.1 Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

residuals_3pl  %>% as.matrix() %>% 
  corrplot::corrplot(type = "upper")

This shows that most item pairs are independent, with only a couple of pairs showing cause for concern:

residuals_3pl %>%
  rownames_to_column(var = "q1") %>%
  as_tibble() %>% 
  pivot_longer(cols = starts_with("z"), names_to = "q2", values_to = "Q3_score") %>% 
  filter(abs(Q3_score) > 0.2) %>% 
  filter(parse_number(q1) < parse_number(q2)) %>%
  gt()
q1 q2 Q3_score
z18 z19 0.6745194
z34 z35 0.2115169
  • Items A18 and A19 are on the product and quotient rules, with an identical structure in each case (giving all the values that need to be used in the relevant formula).

  • Items A34 and A35 are are both about planes in vector geometry.

Given that this violation of the local independence assumption is very mild, we proceed using this model.

4.2 Model parameters

tidy_3pl <- tidy_mirt_coefs(coefs_3pl) %>% 
  filter(par %in% c("a", "b", "g"))

Here is a nicely formatted table of the result:

tidy_3pl %>%
  mutate(ci = str_glue("[{round(CI_2.5, 2)}, {round(CI_97.5, 2)}]")) %>% 
  select(-starts_with("CI_")) %>% 
  pivot_wider(names_from = "par", values_from = c(est, ci), names_glue = "{par}_{.value}") %>% 
  gt() %>% 
  fmt_number(columns = contains("_est"), decimals = 2) %>%
  data_color(
    columns = contains("a_est"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("b_est"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("g_est"),
    colors = scales::col_numeric(palette = c("Reds"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = contains("b_")) %>%
  tab_spanner(label = "Guessing", columns = contains("g_")) %>%
  cols_label(
    a_est = "Est.",
    b_est = "Est.",
    g_est = "Est.",
    a_ci = "95% CI",
    b_ci = "95% CI",
    g_ci = "95% CI"
  )
Question Discrimination Difficulty Guessing
Est. 95% CI Est. 95% CI Est. 95% CI
z1 1.18 [0.99, 1.37] −3.01 [-3.44, -2.58] 0.02 [-0.17, 0.21]
z2 0.83 [0.51, 1.14] 1.81 [1.55, 2.07] 0.10 [0.03, 0.18]
z3 1.66 [1.38, 1.94] −0.23 [-0.42, -0.05] 0.15 [0.06, 0.24]
z4 1.38 [1.14, 1.62] −0.30 [-0.55, -0.06] 0.13 [0.02, 0.24]
z5 1.74 [1.49, 2] 0.22 [0.11, 0.33] 0.07 [0.02, 0.12]
z6 1.66 [1.4, 1.92] −0.72 [-0.94, -0.49] 0.08 [-0.04, 0.21]
z7 1.29 [1.12, 1.47] −0.86 [-1.11, -0.62] 0.01 [-0.11, 0.13]
z8 1.19 [1.01, 1.37] 0.09 [-0.07, 0.26] 0.01 [-0.06, 0.08]
z9 2.04 [1.82, 2.26] 0.14 [0.07, 0.21] 0.00 [-0.02, 0.03]
z10 2.08 [1.81, 2.35] −0.01 [-0.1, 0.09] 0.07 [0.03, 0.12]
z11 1.90 [1.64, 2.15] −0.15 [-0.27, -0.03] 0.07 [0.01, 0.13]
z12 0.96 [0.7, 1.22] −0.81 [-1.58, -0.04] 0.11 [-0.18, 0.41]
z13 0.88 [0.63, 1.13] 1.04 [0.79, 1.3] 0.08 [-0.01, 0.17]
z14 1.64 [1.4, 1.88] −0.09 [-0.23, 0.05] 0.08 [0.01, 0.15]
z15 0.98 [0.72, 1.24] 0.42 [0.08, 0.76] 0.08 [-0.05, 0.2]
z16 1.73 [1.45, 2] 1.28 [1.18, 1.37] 0.01 [0, 0.03]
z17 1.23 [1.12, 1.34] −0.40 [-0.47, -0.32] 0.00 [-0.01, 0.01]
z18 1.25 [1.14, 1.37] 0.55 [0.47, 0.63] 0.00 [0, 0]
z19 1.33 [1.21, 1.45] 0.64 [0.56, 0.72] 0.00 [0, 0]
z20 2.32 [2.02, 2.62] −0.72 [-0.84, -0.6] 0.01 [-0.07, 0.08]
z21 2.15 [1.98, 2.32] −0.03 [-0.09, 0.02] 0.00 [0, 0]
z22 2.07 [1.86, 2.27] −0.02 [-0.09, 0.05] 0.01 [-0.01, 0.04]
z23 1.89 [1.74, 2.05] 0.28 [0.22, 0.33] 0.00 [0, 0]
z24 1.44 [1.32, 1.57] −0.30 [-0.37, -0.24] 0.00 [-0.01, 0.01]
z25 0.54 [0.45, 0.62] −1.11 [-1.36, -0.85] 0.00 [-0.04, 0.05]
z26 2.39 [2.16, 2.61] −1.03 [-1.11, -0.96] 0.00 [-0.01, 0.01]
z27 2.02 [1.85, 2.18] 0.36 [0.3, 0.41] 0.00 [0, 0]
z28 1.87 [1.72, 2.02] 0.22 [0.16, 0.28] 0.00 [0, 0.01]
z29 0.92 [0.83, 1.01] 0.01 [-0.08, 0.09] 0.00 [0, 0.01]
z30 1.25 [1.12, 1.38] −1.08 [-1.21, -0.95] 0.00 [-0.04, 0.05]
z31 1.44 [1.2, 1.68] 0.62 [0.51, 0.74] 0.04 [0, 0.09]
z32 1.63 [1.48, 1.79] 1.14 [1.05, 1.23] 0.00 [0, 0]
z33 0.93 [0.81, 1.04] −1.50 [-1.78, -1.21] 0.01 [-0.1, 0.12]
z34 0.91 [0.81, 1.01] −0.50 [-0.63, -0.37] 0.00 [-0.03, 0.04]
z35 1.31 [1.15, 1.46] 0.74 [0.64, 0.83] 0.00 [-0.02, 0.03]
z36 1.64 [1.32, 1.97] 1.28 [1.17, 1.38] 0.04 [0.02, 0.07]
tidy_3pl_wide <- tidy_3pl %>% 
  pivot_wider(names_from = "par", values_from = c(est, CI_2.5, CI_97.5), names_glue = "{par}_{.value}")

tidy_3pl_wide %>% 
  mutate(qnum = parse_number(Question)) %>% 
  ggplot(aes(
    x = qnum,
    y = g_est
  )) +
  geom_errorbar(aes(ymin = g_CI_2.5, ymax = g_CI_97.5), width = 0, alpha = 0.5) +
  geom_text_repel(aes(label = Question), alpha = 0.5) +
  geom_point() +
  theme_minimal() +
  labs(x = "Question",
       y = "Guessing parameter")

tidy_3pl_wide %>% 
  write_csv("output/eth_pre_3pl-results.csv")

This shows that overall guessing appears to be rare; certainly below the level that might be expected if students of very low ability were simply choosing an answer at random.

About this report

This report supports the analysis in the following paper:

[citation needed]

Packages

In this analysis we used the following packages. You can learn more about each one by clicking on the links below.

  • mirt: For IRT analysis
  • psych: For factor analysis
  • tidyverse: For data wrangling and visualisation
  • reshape: For reshaping nested lists
  • vroom: For reading in many files at once
  • broom: For tidying model output
  • fs: For file system operations
  • gt: For formatting tables
  • knitr: For markdown tables
  • ggrepel: For labelling points without overlap
  • skimr: For data frame level summary
  • ggridges: For ridge plots